require(quanteda)
require(newsmap)
source("functions.R")

# Load the saved tokens object and drop every token that matches the regular
# expression \d; min_nchar = 3 is forwarded to tokens_remove() as an extra
# length constraint.
toks <- tokens_remove(
  readRDS("data_tokens_he.RDS"),
  pattern = '\\d',
  valuetype = 'regex',
  min_nchar = 3
)

# Build the feature matrix without padding, trim it with min_termfreq = 25,
# and finally group it. dfm_group() is called without a `groups` argument, so
# it relies on its default grouping (presumably by document id -- confirm
# against the installed quanteda version).
dfmt <- dfm_group(
  dfm_trim(
    dfm(toks, remove_padding = TRUE),
    min_termfreq = 25
  )
)

# Look up the tokens in data_dictionary_newsmap_he, using only level 3 of the
# dictionary and resolving nested matches at the "dictionary" scope.
toks_key <- tokens_lookup(
  toks,
  dictionary = data_dictionary_newsmap_he,
  levels = 3,
  nested_scope = "dictionary"
)

# Convert the matched keys to a matrix and group it the same way as the
# feature matrix (dfm_group() with its default grouping).
dfmt_key <- dfm_group(dfm(toks_key))

# Fit the newsmap model: dfmt provides the features, dfmt_key the labels.
newsmap <- textmodel_newsmap(x = dfmt, y = dfmt_key)

# Predict with confidence scores and collect the result in a data frame that
# also carries the document names (taken from the names of pred$class).
pred <- predict(newsmap, confidence.fit = TRUE)
dat <- data.frame(pred, docid = names(pred$class))

# Any prediction whose confidence is negative is overwritten with "il".
dat$class <- replace(dat$class, dat$confidence.fit < 0, "il")

# Frequency of each predicted class, in ascending order.
sort(table(dat[["class"]]))

# Request the prediction output for all classes (type = "all") and flag the
# rows in which no more than one entry is positive.
mat <- predict(newsmap, type = "all")
dat$alone <- !(rowSums(mat > 0) > 1)

# Persist the prediction table for later steps.
saveRDS(dat, file = "data_newsmap_he.RDS")

